#!/usr/bin/perl

my $host = "localhost";
my $username = "xxxxx";
my $database = "xxxxx";
my $password = "xxxxx";

my $dvobjectoffset = shift @ARGV;
my $filecatoffset = shift @ARGV;

unless ($dvobjectoffset > 0)
{
    print STDERR "Usage: ./files_source_ <DVOBJECT DB ID OFFSET> <FILECATID_OFFSET>\n";
    exit 1;
}

unless ($filecatoffset)
{
    print STDERR "WARNING! file category offset is set to ZERO.\n";
}

my $filecatid = $filecatoffset; # file categories (this is a new object in 4.0, so there are no 3.6 IDs to reuse)

use DBI;

my $dbh = DBI->connect("DBI:Pg:dbname=$database;host=$host",$username,$password); 

open PL, ">packlist.txt";

%STUDYMAP = {};
%STUDYFILEMAP = {};
%VERSIONMAP = {}; 

while ( <> )
{
    chop; 
    my ($globalid, $dsid, $dsvid, $dsvnum) = split("\t", $_); 
    $STUDYMAP{$globalid} = $dsid;
    $VERSIONMAP{$globalid . "+++" . $dsvnum} = $dsid . "-" . $dsvid; 

    %FILECATEGORIES = {}; # file categories for this dataset.

    if ($globalid =~/^([a-z]*):(.*)\/([^\/]*)$/)
    {
	$protocol = $1;
	$authority = $2; 
	$identifier = $3;

#	print $protocol . " " . $authority . " " . $identifier . "\n";
    }
    else 
    {
	print STDERR "WARNING! illegal global id: " . $globalid . "\n";
	next; 
    }

    my $sth; 

    $sth = $dbh->prepare(qq {SELECT s.id, v.id FROM study s, studyversion v WHERE v.study_id = s.id AND s.protocol = '$protocol' AND s.authority='$authority' AND s.studyid = '$identifier' AND v.versionnumber = $dsvnum}); 
    $sth->execute();

    my $vercount = 0; 

    my $sid; 
    my $svid; 

    while ( @foo = $sth->fetchrow() )
    {
	$sid = $foo[0];
	$svid = $foo[1];

	$vercount++;
    }

    $sth->finish; 

    unless ($vercount == 1) 
    {
	print STDERR "WARNING: invalid number of versions for study " . $globalid . ", with version number " . $dsvnum . " (" . $vercount . ")!\n";
	next; 
    }

    $sth = $dbh->prepare(qq {SELECT fm.label, fm.category, fm.description, sf.filetype, sf.filesystemlocation, sf.md5, sf.restricted, sf.originalfiletype, sf.unf, sf.id, sf.fileclass, fm.id FROM filemetadata fm, studyfile sf WHERE fm.studyfile_id = sf.id AND fm.studyversion_id = $svid});

    $sth->execute();

    my $newfile = 0; 

    while ( @foo = $sth->fetchrow() )
    {
	# new filemetadata fields: 
	$label = $foo[0];
	$description = $foo[2];
	$description =~s/\n/ /g;
	$description = $dbh->quote($description);
	# category: 
	$category = $foo[1];
	# new datafile fields: 
	$type = $foo[3];
	unless ($type =~m:/:)
	{
	    $type = "application/octet-stream";
	}
	$md5 = $foo[5];
	$restricted = $foo[6];

	# "restricted" is a boolean:
	
	$restricted = 'TRUE' if $restricted;
	$restricted = 'FALSE' unless $restricted;

	# location of the file, on the old filesystem: 
	$fslocation = $foo[4];
	$fslocation = "" unless $fslocation;
	
	# additional info for subsettable files: 
	# (will go into the new datatable)
	$originalfiletype = $foo[7];
	$unf = $foo[8]; 
	# id of the existing studyfile: 
	$sfid = $foo[9];
	# "class" of the existing studyfile: 
	# (tabular, "other", etc.)
	$fileclass = $foo[10];
	$fmid = $foo[11];

	if ($label =~/[\\\/:\*\?\"\<\>\|;\#]/)
	{
	    $preservedlabel = $label; 
	    $label=~s/[\\\/:\*\?\"\<\>\|;\#]//g;
	    
	    print STDERR "LABEL REPLACED: (FILEMETA: " . $fmid . ", FILE: " . $sfid . ", STUDY: " . $sid . ", VERSION: " . $svid . ", GLOBALID: " . $globalid . ") OLD: \"" . $preservedlabel . "\", NEW: \"" . $label . "\"\n";
	}

	if ($label eq '')
	{
	    $label = "UNKNOWN";
	}

	$label = $dbh->quote($label);


	unless ($STUDYFILEMAP{$sfid})
	{
	    $newfile = 1; 
	    # Certain things only need to be done once per file -
	    # namely, each file needs one dvobject and datafile each;
	    # same for the datatables and variables. 
	    # Other things, like filemetadatas, need to be created one 
	    # per version. 
	    
	    $newdatafileid = ($dvobjectoffset+$sfid);
	    $STUDYFILEMAP{$sfid} = $newdatafileid; 

	    $fsname = $fslocation; 

	    if ($fslocation =~/^http/ )
	    {
		$fsize = 0;
		$fmtime = &formatTimeStamp(time);
	    }
	    else 
	    {
		if ( -f $fslocation ) 
		{
		    @fstats = stat($fslocation); 
		    $fsize = $fstats[7]; 
		    $mtime = $fstats[9]; 

		    $fmtime = &formatTimeStamp($mtime);
		    $packlistentry = $fslocation; 
		    $packlistentry =~s/.*\/DVN\/data\///; 
		    print PL $packlistentry . "\n";
		}
		else 
		{
		    print STDERR "WARNING: file " . $fslocation . " not found!\n";
		    $fsize = 0;
		    $fmtime = &formatTimeStamp(time);
		}
		
		$fsname =~s/^.*\///g; 
	    }

	    # dvobject: 

	    print qq {INSERT INTO dvobject (id, dtype, owner_id, createdate, modificationtime) VALUES ($newdatafileid, 'DataFile', $dsid, '$fmtime', '$fmtime');} . "\n";

	    # datafile object:
	    $fsname = $dbh->quote($fsname);
	    print qq {INSERT INTO datafile (id, contenttype, filesystemname, filesize, md5, restricted) VALUES ($newdatafileid, '$type', $fsname, $fsize, '$md5', $restricted);} . "\n";
	    # Use the below line instead of the above if you are using 4.6 or above
            # print qq {INSERT INTO datafile (id, contenttype, filesystemname, filesize, checksumtype, restricted,checksumvalue,rootdatafileid) VALUES ($newdatafileid, '$type', $fsname, $fsize, 'MD5', $restricted,'',-1);} . "\n";
	}
	else 
	{
	    $newdatafileid = $STUDYFILEMAP{$sfid};
	    $newfile = 0; 
	}
	    
	# file metadata object:
	print qq {INSERT INTO filemetadata (id, description, label, restricted, version, datasetversion_id, datafile_id) VALUES ($fmid, $description, $label, $restricted, 1, $dsvid, $newdatafileid);} . "\n";
	
	# and the category, if exists:

	if ($category && $category ne "") 
	{
	    $category = $dbh->quote($category); 
	    unless ($FILECATEGORIES{$category})
	    {
		# this is a new category (for this dataset), 
		# so it needs to be created: 

		$filecatid++;

		print qq{INSERT INTO datafilecategory (id, name, dataset_id) VALUES ($filecatid, $category, $newdatafileid);} . "\n";

		$FILECATEGORIES{$category} = $filecatid; 
	    }

	    my $fcid = $FILECATEGORIES{$category};
	    print qq{INSERT INTO filemetadata_datafilecategory (filecategories_id, filemetadatas_id) VALUES ($fcid, $fmid);} . "\n";

	}


	# subsettable files: 
	# (again, this only needs to be done once per file!)


	if ($newfile && ($fileclass eq "TabularDataFile"))
	{
	    #print STDERR "this is a subsettable file.\n";	

	    # NOTE: 
	    # there's only one datatable per file - make sure to only run this once!
	    # (i.e., not for every version!)
	    
	    $sth1 = $dbh->prepare(qq {SELECT id, varquantity, casequantity, unf, recordspercase FROM datatable WHERE studyfile_id = $sfid});

	    $sth1->execute();

	    $count = 0; 

	    while ( @dt = $sth1->fetchrow() )
	    {
		$dtid = $dt[0];
		$varquantity = $dt[1];
		$casequantity = $dt[2];
		$dtunf = $dt[3];
		$recordspercase = $dt[4];

		$count++;

		unless ($unf eq $dtunf) 
		{
		    print STDERR "WARNING: unf mismatch, between studyfile and datatable: " + $unf + ":" + $dtunf + "\n";
		}

		# datatable object:


		if ($recordspercase) 
		{
		    print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, recordspercase, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $recordspercase, $newdatafileid);} . "\n";
		}
		else
		{
		    print qq {INSERT INTO datatable (id, varquantity, casequantity, unf, originalfileformat, datafile_id) VALUES ($dtid, $varquantity, $casequantity, '$unf', '$originalfiletype', $newdatafileid);} . "\n";
		}
	    }

	    $sth1->finish;
	    
	    unless ($count == 1) 
	    {
		print STDERR "WARNING: invalid numbe of datatables: " + $count +".\n";
	    }
	    else 
	    {
		# variables:
		$sth1 = $dbh->prepare(qq {SELECT name, label, variableformattype_id, variableintervaltype_id, formatcategory, formatschema, formatschemaname, unf, fileorder, weighted, orderedfactor, numberofdecimalpoints, universe, filestartposition, fileendposition, recordsegmentnumber, id FROM datavariable WHERE datatable_id = $dtid});


		$sth1->execute();
		
		while ( @dv = $sth1->fetchrow() )
		{
		    $varname = $dv[0];
		    $varname = $dbh->quote($varname);
		    $varlabel = $dv[1];
		    $varlabel = $dbh->quote($varlabel);
		    $variableformattype_id = $dv[2];
		    # the old school formattype_id and 
		    # intervaltype_id need to be adjusted by 1, 
		    # to match the new enum values used in the
		    # 4.0 datavariables:
		    $variableformattype_id--;
		    $variableintervaltype_id = $dv[3];
		    $variableintervaltype_id--;
		    $varformatcategory = $dv[4];
		    $varformatschema = $dv[5];
		    $varformatschemaname = $dv[6];
		    $varunf = $dv[7];
		    $varfileorder = $dv[8];
		    $varweighted = $dv[9];
		    if ($varweighted)
		    {
			$varweighted = "TRUE";
		    }
		    else 
		    {
			$varweighted = "FALSE";
		    }
		    $varorderedfactor  = $dv[10];
		    if ($varorderedfactor)
		    {
			$varorderedfactor = "TRUE";
		    }
		    else 
		    {
			$varorderedfactor = "FALSE";
		    }

		    $varnumberofdecimalpoints = $dv[11];
		    $varuniverse = $dv[12];
		    $varfilestartposition = $dv[13];
		    $varfileendposition = $dv[14];
		    $varrecordsegmentnumber = $dv[15];
		    $varid = $dv[16];

		    

		    # new datavariable object: 

		    $newdvfields = "id, name, label, interval, type, unf, fileorder, orderedfactor, weighted, datatable_id";
		    $newdvvalues = qq {$varid, $varname, $varlabel, $variableintervaltype_id, $variableformattype_id, '$varunf', $varfileorder, $varorderedfactor, $varweighted, $dtid};

		    if ($varformatschemaname)
		    {
			# becomes "format":
			$newdvfields = $newdvfields . ", format";
			$newdvvalues = qq{$newdvvalues, '$varformatschemaname'};
		    }

		    if ($varformatcategory)
		    {
			$newdvfields = $newdvfields . ", formatcategory";
			$newdvvalues = qq{$newdvvalues, '$varformatcategory'};
		    }

		    if ($varfilestartposition)
		    {
			$newdvfields = $newdvfields . ", filestartposition";
			$newdvvalues = qq{$newdvvalues, $varfilestartposition};
		    }

		    if ($varfileendposition)
		    {
			$newdvfields = $newdvfields . ", fileendposition";
			$newdvvalues = qq{$newdvvalues, $varfileendposition};
		    }

		    if ($varrecordsegmentnumber)
		    {
			$newdvfields = $newdvfields . ", recordsegmentnumber";
			$newdvvalues = qq{$newdvvalues, $varrecordsegmentnumber};
		    }

		    if ($varuniverse)
		    {
			$newdvfields = $newdvfields . ", universe";
			$newdvvalues = qq{$newdvvalues, '$varuniverse'};
		    }

		    if ($varnumberofdecimalpoints)
		    {
			$newdvfields = $newdvfields . ", numberofdecimalpoints";
			$newdvvalues = qq{$newdvvalues, $numberofdecimalpoints};
		    }


		    print qq {INSERT INTO datavariable ($newdvfields) VALUES ($newdvvalues);} . "\n";

		    # variable categories: 
		    $sth2 = $dbh->prepare(qq {SELECT id, label, value, missing, catorder, frequency FROM variablecategory WHERE datavariable_id = $varid});
		    $sth2->execute();
		
		    while ( @vc = $sth2->fetchrow() )
		    {
			$varcatid = $vc[0];
			$varcatlabel = $vc[1];
			$varcatvalue = $vc[2];
			$varcatmissing = $vc[3];
			if ($varcatmissing)
			{
			    $varcatmissing = "true";
			}
			else 
			{
			    $varcatmissing = "false";
			}
			$varcatorder = $vc[4];
			unless ($varcatorder) 
			{
			    if ($varcatorder eq "" || $varcatorder != 0)
			    {
				$varcatorder = "null";
			    }
			}
			$varcatfreq = $vc[5];
			unless ($varcatfreq) 
			{
			    if ($varcatfreq eq "" || $varcatfreq != 0)
			    {
				$varcatfreq = "null";
			    }
			}


			# only migrate the *real* categories: 
			if ($varcatlabel) 
			{
			    $varcatlabel = $dbh->quote($varcatlabel);
			    unless ($varcatvalue || ($varcatvalue eq "") || ($varcatvalue == 0))
			    {
				print STDERR qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n";
			    }
			    else 
			    {
				$varcatvalue = $dbh->quote($varcatvalue);
				print qq {INSERT INTO variablecategory (id, label, value, missing, catorder, frequency, datavariable_id) VALUES ($varcatid, $varcatlabel, $varcatvalue, $varcatmissing, $varcatorder, $varcatfreq, $varid);} . "\n";
			    }
			}
			else 
			{
			    #print STDERR "empty var cat label.\n";
			}
		    }

		    $sth2->finish;
		}

		$sth1->finish;
	    }


		
	}
    }

    $sth->finish; 

}

# Now, the guestbooks/download activity etc.:

# guest books from the old "studyfile activity" entries:

$sth = $dbh->prepare(qq {SELECT a.downloadcount, a.lastdownloadtime, a.studyfile_id, s.authority, s.studyid, s.protocol FROM studyfileactivity a, study s WHERE a.study_id = s.id AND a.downloadcount > 0}); 
$sth->execute();

$id = 1000; 

while ( @foo = $sth->fetchrow() )
{
    my $acount = $foo[0];
    my $adownloadtime = $foo[1];
    $adownloadtime = "TIMESTAMP " . $dbh->quote($adownloadtime) if $adownloadtime;
    $adownloadtime = "NULL" unless $adownloadtime;

    my $astudyfile_id = $foo[2];
    my $sauthority = $foo[3];
    my $sidentifier = $foo[4];
    my $sprotocol = $foo[5];

    next unless $astudyfile_id; 
    $astudyfile_id += $dvobjectoffset;

    my $globalid = $sprotocol . ":" . $sauthority . "/" . $sidentifier; 

    unless ($STUDYMAP{$globalid})
    {
        next; 
    }
    
    ($dataset_id) = $STUDYMAP{$globalid};
        
    for ($i = 0; $i < $acount; $i++)
    {
        $id++; 

        if ($i == $acount - 1) 
        {
            $downloadtime = $adownloadtime;
        }
        else 
        {
            $downloadtime = "NULL";
        }

        print qq {INSERT INTO guestbookresponse (id, email, name, institution, position, responsetime, guestbook_id, datafile_id, authenticateduser_id, downloadtype, sessionid, dataset_id, datasetversion_id) VALUES ($id, NULL, 'unknown', NULL, NULL, $downloadtime, 1, $astudyfile_id, NULL, 'download', NULL, $dataset_id, NULL);} . "\n";
    }

}

$guestbook_response_id_offset = $id; 

# Migrating guestbooks: 

$sth = $dbh->prepare(qq {SELECT id, emailrequired, enabled, firstnamerequired, lastnamerequired, institutionrequired, positionrequired, vdc_id FROM guestbookquestionnaire}); 
$sth->execute();

while ( @foo = $sth->fetchrow() )
{
    my $gid = $foo[0];

    if ($gid == 1)
    {
        # print STDERR "found guestbook with id=1!\n";
	# This is the default guestbook; we don't need to migrate it, as the 
	# new Dataverse 4.0 will have its own default guestbook.
        next;
    }

    my $gemailrequired = $foo[1] ? "TRUE" : "FALSE";
    my $genabled = $foo[2] ? "TRUE" : "FALSE";

    my $gnamerequired = ($foo[3] || $foo[4]) ? "TRUE" : "FALSE";

    my $ginstitutionrequired = $foo[5] ? "TRUE" : "FALSE";
    my $gpositionrequired = $foo[6] ? "TRUE" : "FALSE";

    my $gdataverse_id = $foo[7] + 9; 

    print qq {INSERT INTO guestbook (id, createtime, emailrequired, enabled, institutionrequired, name, namerequired, positionrequired, dataverse_id) VALUES ($gid, TIMESTAMP '1970-01-01 00:00:00', $gemailrequired, $genabled, $ginstitutionrequired, '', $gnamerequired, $gpositionrequired, $gdataverse_id);} . "\n";

}

# Finally, migrating guestbook responses:

$sth = $dbh->prepare(qq {SELECT r.id, r.email, r.firstname, r.institution, r.lastname, r.position, r.responsetime, r.guestbookquestionnaire_id, r.studyfile_id, r.vdcuser_id, r.downloadtype, r.sessionid, s.authority, s.studyid, s.protocol, v.versionnumber FROM guestbookresponse r, studyversion v, study s WHERE r.study_id = s.id AND r.studyversion_id = v.id}); 
$sth->execute();


while ( @foo = $sth->fetchrow() )
{
    my $rid = $foo[0];

    $rid += $guestbook_response_id_offset;

    my $remail = $dbh->quote($foo[1]);
    my $rfirstname = $foo[2];
    my $rinstitution = $dbh->quote($foo[3]);
    my $rlastname = $foo[4];
    my $rposition = $dbh->quote($foo[5]);
    my $rresponsetime = $foo[6];
    $rresponsetime = "TIMESTAMP " . $dbh->quote($rresponsetime) if $rresponsetime;
    $rresponsetime = "NULL" unless $rresponsetime;
    my $rgbqid = $foo[7];
    my $rstudyfileid = $foo[8];
    $rstudyfileid+=$dvobjectoffset;
    my $rvdcuserid = $foo[9] ? $foo[9] : "NULL";
    my $rdownloadtype = $dbh->quote($foo[10]);
    my $rsessionid = $dbh->quote($foo[11]);

    my $sauthority = $foo[12];
    my $sidentifier = $foo[13];
    my $sprotocol = $foo[14];
    my $vversionnumber = $foo[15];


    my $globalid = $sprotocol . ":" . $sauthority . "/" . $sidentifier; 

    unless ($VERSIONMAP{$globalid . "+++" . $vversionnumber})
    {
        print STDERR "WARNING: No entry for " . $globalid .  "+++" . $vversionnumber. "!\n";
        next; 
    }
    
    ($dataset_id, $datasetversion_id) = split ("\-", $VERSIONMAP{$globalid . "+++" . $vversionnumber});
        
    unless ($dataset_id > 0 && $datasetversion_id > 0)
    {
        print STDERR "Invalid entry for " . $globalid .  "+++" . $vversionnumber. ": " . $VERSIONMAP{$globalid . "+++" . $vversionnumber} . "!\n";
        next;
    }
    
    my $name = "";
    $name = $rfirstname . " " if $rfirstname;
    $name .= $rlastname if $rlastname; 
    $name = $dbh->quote($name) if $name; 
    $name = "NULL" unless $name; 
    
    print qq {INSERT INTO guestbookresponse (id, email, name, institution, position, responsetime, guestbook_id, datafile_id, authenticateduser_id, downloadtype, sessionid, dataset_id, datasetversion_id) VALUES ($rid, $remail, $name, $rinstitution, $rposition, $rresponsetime, $rgbqid, $rstudyfileid, $rvdcuserid, $rdownloadtype, $rsessionid, $dataset_id, $datasetversion_id);} . "\n";

}



$dbh->disconnect; 

close PL; 

exit 0; 

sub formatTimeStamp () {
    my ($mtime) = (@_);
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($mtime);

    $year+=1900;
    $mon++;

    $fmt = $year . "-" . sprintf("%02d",$mon) . "-" . sprintf("%02d",$mday) . " " . 
	sprintf("%02d", $hour) . ":" . sprintf("%02d",$min) . ":" . sprintf("%02d",$sec); 

    return $fmt;
}





  
